import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
from itables import init_notebook_mode
init_notebook_mode(all_interactive=False)
from itables import show
from PIL import Image
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pd.options.display.max_columns = None

Quarto Basics
First we load in the libraries and set defaults
Now we read in the data files
# All element files live in one directory; parameterizing the base path keeps
# the file-list construction in one reusable, testable place.
ELEMENTS_DIR = "C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/"

def build_element_paths(base_dir=ELEMENTS_DIR):
    """Return the TDS element data file paths: one 'Elements <year>.txt'
    per year 1991-2016, plus the two split 2017 market-basket files."""
    paths = [f"{base_dir}Elements {year}.txt" for year in range(1991, 2017)]
    paths += [
        f"{base_dir}Elements 2017 MB 1-3.txt",
        f"{base_dir}Elements 2017 MB 4.txt",
    ]
    return paths

files = build_element_paths()

# The yearly files are tab-delimited; latin-1 covers the non-UTF-8 bytes
# present in some of the older files.
data_read = [pd.read_table(file, encoding="latin-1") for file in files]
# 2018-2020 ships as a separate CSV with a different column schema; the
# mismatched columns are reconciled during the cleanup below.
data_read.append(pd.read_csv(ELEMENTS_DIR + "2018_2020.csv"))
df = pd.concat(data_read)
df.shape
df.shape(429946, 37)
df.dtypesMB float64
Food No. float64
Food Name object
Anal Type object
Sample Qualifier object
Replicate No. object
Element object
Conc float64
Unit object
Trace object
LOD float64
LOQ float64
Result Qualifier and Remarks object
Method object
Instrument object
Batch ID object
METHOD object
Food No float64
ReplicateNo object
Reference Material object
QC Level object
QC Unit object
QC% Recvd object
City ID object
Replicate No float64
FiscalYear float64
CalendarYear float64
Collection float64
TDSFoodNumber float64
TDSFoodDescription object
Season object
TDSFoodListType object
Region object
Analyte object
Units object
Concentration float64
ReportingLimit float64
dtype: object
Exploring the data and doing some clean up
# Use dots instead of spaces in column names so the two source schemas can
# be addressed uniformly (R-style names such as 'Food.No.').
df.columns = df.columns.str.replace(" ", ".")

# Normalize food names so identical foods match across years, and strip
# stray spaces from the unit strings.
df['Food.Name'] = df['Food.Name'].str.replace('w/', 'with').str.upper()
df['Unit'] = df['Unit'].str.replace(' ', '')
df['Unit'].unique()
Trying to understand the dataset. What is missing?
# Missing-value counts per column: reveals which columns belong to which of
# the source file formats (the pre-2018 files vs the 2018-2020 CSV).
df.isnull().sum()
What are the null entries? Trying to figure out the difference in the null counts between Unit and Units
# Count rows where neither unit column is populated.
(pd.isnull(df['Units']) & pd.isnull(df['Unit'])).sum()

# Exactly one row has neither Unit nor Units; drop it so every remaining
# row carries a unit in exactly one of the two columns.
df = df[~(pd.isnull(df['Units']) & pd.isnull(df['Unit']))].copy()

# Unit and Units are mutually exclusive (their null counts sum to the row
# count), so fill Unit from Units. BUG FIX: the original chained
# assignment df.loc[mask]['Unit'] = ... is a silent no-op under
# copy-on-write (pandas raised ChainedAssignmentError); fillna performs
# the merge in a single, in-place step.
df['Unit'] = df['Unit'].fillna(df['Units'])
# Units has been folded into Unit; drop the redundant column.
df = df.drop(columns=['Units'])

# Conc / Concentration split the same way Unit / Units did: never both
# null, and their null counts sum to the row count. BUG FIX: merge with
# fillna instead of the chained assignment, which never updated df under
# copy-on-write.
df['Conc'] = df['Conc'].fillna(df['Concentration'])
df = df.drop(columns=['Concentration'])
# Element / Analyte follow the same mutually-exclusive pattern; fill one
# from the other and drop the redundant column. BUG FIX: fillna replaces
# the chained assignment that was a no-op under copy-on-write.
df['Element'] = df['Element'].fillna(df['Analyte'])
df = df.drop(columns=['Analyte'])
df.shape
We need to also fix the Food No issue. There are two columns named Food No that we need to edit similar to what we did above. However, this time we need to index with integers
# Two food-number columns remain: 'Food.No.' at position 1 and 'Food.No'
# at position 17. They are merged positionally because their labels are
# nearly identical. BUG FIX: direct .iloc column assignment updates df in
# place; the original chained df.loc[mask].iloc[...] = ... form silently
# did nothing under copy-on-write.
df.iloc[:, 1] = df.iloc[:, 1].fillna(df.iloc[:, 17])

# The merged values now live in column 1, so the position-17 duplicate can
# be dropped (the later column listings assume it is gone).
df = df.drop(columns=df.columns[17])
df.isnull().sum()
# Normalize concentrations to mg/kg: ug/kg and ppb are the same magnitude
# (1/1000 of mg/kg); other units pass through unchanged. BUG FIX: the
# original assigned Conc_mgkg twice with independent np.where calls, so
# the second ('ppb') pass overwrote the ug/kg conversion with raw values.
df['Conc_mgkg'] = np.where(df['Unit'].isin(['ug/kg', 'ppb']),
                           df['Conc'] / 1000, df['Conc'])
df = df.drop(columns=['Conc'])

# Element names vary in case (and trailing spaces) between eras.
df['Element'] = df.Element.str.upper()
df['Element'].unique()
# Every '%' row is a water analysis rather than an element concentration;
# set those aside and keep only the element measurements.
df_percent = df.query('Unit == "%"').copy()
df = df.query('Unit != "%"').copy()

# Harmonize the legacy trailing-dot column spellings.
df = df.rename(columns={"Food.No.": "Food.No",
                        "Replicate.No.": "Replicate.No"})

# Elements measured consistently enough across the years to analyze.
elements = [
    'ARSENIC', 'CADMIUM', 'CALCIUM', 'COPPER', 'IRON', 'LEAD',
    'MAGNESIUM', 'PHOSPHORUS', 'POTASSIUM', 'SELENIUM', 'SODIUM',
    'ZINC', 'MANGANESE', 'NICKEL', 'MOLYBDENUM', 'BARIUM',
    'CHROMIUM', 'THALLIUM', 'STRONTIUM',
]
df = df[df['Element'].isin(elements)].copy()
df.columns
To understand what the columns are referring to see https://www.fda.gov/food/fda-total-diet-study-tds/fda-total-diet-study-tds-results and https://www.fda.gov/media/93359/download?attachment
# Screenshot of the FDA TDS column key (see the links above) for reference
# while interpreting the column names.
img = Image.open('C:/Users/carte/OneDrive/Documents/GitHub/FDA-TDS/elements_column_key.PNG')
display(img)
Some summarizing
# Per-element non-null counts for every column: shows which columns each
# data era actually populates for each element.
df.groupby('Element').count()

# Peek at the market-basket identifier (format YYYYNN, stored as float64).
df['MB']
From the column key, we see the first four digits of MB are the year. The last two digits are the specific market basket. Let’s extract this information
# MB encodes YYYYNN: the first four digits are the collection year, the
# last two the market basket within that year. BUG FIX: cast through int
# before str -- str() of the float MB ('199103.0') keeps a trailing '.0',
# which is why the original str[-4:] produced basket codes like '03.0'.
mb_str = df['MB'].astype('int64').astype(str)
df['Year'] = mb_str.str[:4].astype(int)
df['Market_Basket'] = mb_str.str[-2:]
df.shape
After working with the features let’s see what is still missing in the data
# Remaining missingness after the column merges and feature extraction.
df.isnull().sum()

# Side-by-side look at the merged identifier and measurement columns.
# Note: 'Conc' was dropped when Conc_mgkg was created, so inspect
# 'Conc_mgkg' here (the original selection listed 'Conc' and duplicated
# labels, which would now raise a KeyError).
df[['Food.No', 'Food.Name', 'Replicate.No', 'Element', 'Trace', 'LOD',
    'LOQ', 'METHOD', 'ReplicateNo', 'FiscalYear', 'CalendarYear',
    'Collection', 'TDSFoodNumber', 'TDSFoodDescription', 'Season',
    'TDSFoodListType', 'Region', 'ReportingLimit', 'Conc_mgkg',
    'Year', 'Market_Basket']]

df.columns
# 'Conc' was dropped when Conc_mgkg was created, so plot the normalized
# column. LOQ is reported in the same units as the original Conc, so it
# must also be converted to mg/kg before filtering to detections above
# the quantitation limit (comparing Conc_mgkg against a raw ug/kg LOQ
# would misclassify those rows).
loq_mgkg = np.where(df['Unit'].isin(['ug/kg', 'ppb']),
                    df['LOQ'] / 1000, df['LOQ'])
detected = df[df['Conc_mgkg'] > loq_mgkg]

ax = sns.boxplot(detected, x='Element', y='Conc_mgkg')
ax.set_yscale("log")
ax.tick_params(axis='x', rotation=90)

fig = px.box(detected, x='Element', y='Conc_mgkg', log_y=True)
fig.show()